Part 1 - EDA¶

In [2]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import scipy, scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.formula.api as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm 
import statsmodels.formula.api as smf 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import (
    ConfusionMatrixDisplay

)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from statsmodels.graphics.regressionplots import plot_regress_exog, plot_fit, plot_leverage_resid2, influence_plot
from scipy.stats import shapiro, levene, mannwhitneyu
from scipy.stats import chi2_contingency
from sklearn.model_selection import learning_curve
from scipy.stats import skew, kurtosis
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
In [3]:
df = pd.read_csv('insurance.csv')
df.info()
print(df.describe())
print(df.nunique())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010
age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64
In [4]:
df.head()
Out[4]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [5]:
df.tail()
Out[5]:
age sex bmi children smoker region charges
1333 50 male 30.97 3 no northwest 10600.5483
1334 18 female 31.92 0 no northeast 2205.9808
1335 18 female 36.85 0 no southeast 1629.8335
1336 21 female 25.80 0 no southwest 2007.9450
1337 61 female 29.07 0 yes northwest 29141.3603
In [6]:
duplicates = len(df[df.duplicated()])

missing_values = df.isnull().sum().sum()

print('Number of Duplicate Entries: %d'%(duplicates))
print('Number of Missing Values: %d'%(missing_values))
print('Number of Features: %d'%(df.shape[1]))
print('Number of Observations: %d'%(df.shape[0]))
Number of Duplicate Entries: 1
Number of Missing Values: 0
Number of Features: 7
Number of Observations: 1338
In [7]:
print("Object type columns: ",list(df.select_dtypes(include='object')))
print("Number type columns: ", list(df.select_dtypes(include='number')))
Object type columns:  ['sex', 'smoker', 'region']
Number type columns:  ['age', 'bmi', 'children', 'charges']
In [8]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()
Out[8]:
0
In [9]:
print(df.shape)
print(df.size)
(1337, 7)
9359
In [10]:
def pie_plot(column, ax):
    ax.pie(df[column].value_counts(), autopct="%0.2f%%",
           labels=df[column].value_counts().index,
           colors=['#1f77b4', '#3498db', '#74b9ff'])
    ax.set(title=f"Pie Chart of {column}")
In [11]:
columns = ["sex", "smoker", "region",'children']
fig, axes = plt.subplots(1, len(columns), figsize=(12, 4))

for i, column in enumerate(columns):
    pie_plot(column, axes[i])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [12]:
df.boxplot(column=['age', 'bmi', 'children', 'charges'])
plt.title('Box plot of Numerical Columns')
plt.show()
No description has been provided for this image
In [13]:
df.boxplot(column='charges', by='smoker', showmeans = True, meanline = True, medianprops = dict(color = "blue", linewidth = 1.5))

df.boxplot(column='charges', by='smoker', showfliers = False, showmeans = True, meanline = True, medianprops = dict(color = "blue", linewidth = 1.5))
plt.xlabel('Smoker')
plt.ylabel('Charges')
plt.title('Charges Distribution for Smokers and Non-Smokers')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [14]:
numerical_vars = ['age', 'bmi', 'charges']
plt.figure(figsize=(15, 5))
for i, var in enumerate(numerical_vars):
    plt.subplot(1, 3, i+1)
    sns.boxplot(y=df[var], showmeans = True, meanline = True, medianprops = dict(color = "blue", linewidth = 1.5), color="azure")
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
df.region.value_counts().plot(kind='bar', color=sns.color_palette("ch:s=.25,rot=-.25"))

plt.xlabel('Region')

plt.ylabel('Total Charges')
plt.title('Total Charges by Region')

plt.xticks(rotation=45)
plt.xticks(rotation=0)
Out[15]:
(array([0, 1, 2, 3]),
 [Text(0, 0, 'southeast'),
  Text(1, 0, 'southwest'),
  Text(2, 0, 'northwest'),
  Text(3, 0, 'northeast')])
No description has been provided for this image
In [16]:
df.plot(kind='scatter', x='bmi', y='charges', color='#3498db')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.title('Charges by BMI')
plt.show()
No description has been provided for this image
In [17]:
df.groupby(['region'])\
    ['charges'].agg(['mean','median', 'min', 'max']).round(2)
Out[17]:
mean median min max
region
northeast 13406.38 10057.65 1694.80 58571.07
northwest 12450.84 8976.98 1621.34 60021.40
southeast 14735.41 9294.13 1121.87 63770.43
southwest 12346.94 8798.59 1241.56 52590.83
In [18]:
bins = [0,18.5,25,30,60]
bin_labels = ['underweight','normal','overweight', 'obese']

df['bmi_group'] = pd.cut(df.bmi, bins, right=False, labels = bin_labels)

df.bmi_group.value_counts().sort_index()
sns.scatterplot(x="bmi_group", y="charges", hue="bmi_group", data=df)
plt.xlabel('BMI Group')
plt.ylabel('Charges')
plt.title('Charges by BMI')
Out[18]:
Text(0.5, 1.0, 'Charges by BMI')
No description has been provided for this image
In [19]:
pd.cut(df['bmi'], [0,18.5,25,30,60], labels=["underweight", "normal", "overweight", "obese"]).value_counts()
Out[19]:
bmi
obese          704
overweight     386
normal         226
underweight     21
Name: count, dtype: int64
In [20]:
def percetage_data(df=None, column=None,sort=True):
    value_counts = df[column].value_counts(sort=sort)
    df_percentage = pd.DataFrame({'count': value_counts,
                                  'percentage': round(value_counts *100 / len(df),2)
                                 })

    return df_percentage

percetage_data(df,'bmi_group')
Out[20]:
count percentage
bmi_group
obese 706 52.80
overweight 386 28.87
normal 225 16.83
underweight 20 1.50
In [21]:
df.groupby(['bmi_group'])\
    ['charges'].agg(['mean','median', 'min', 'max']).round(2)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\2137049437.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  df.groupby(['bmi_group'])\
Out[21]:
mean median min max
bmi_group
underweight 8852.20 6759.26 1621.34 32734.19
normal 10409.34 8603.82 1121.87 35069.37
overweight 10987.51 8659.38 1252.41 38245.59
obese 15572.04 10003.65 1131.51 63770.43
In [22]:
percetage_data(df,['bmi_group','smoker'], sort=False)
Out[22]:
count percentage
bmi_group smoker
underweight no 15 1.12
yes 5 0.37
normal no 175 13.09
yes 50 3.74
overweight no 312 23.34
yes 74 5.53
obese no 561 41.96
yes 145 10.85
In [23]:
df.pivot_table(index='sex', columns='smoker', values='charges')
Out[23]:
smoker no yes
sex
female 8762.297300 30678.996276
male 8099.700161 33042.005975
In [24]:
sns.set(style="whitegrid")

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(6, 4))

sns.histplot(df['age'], bins=30, kde=True, ax=axes[0, 0])
axes[0, 0].set_title('Age Distribution')

sns.histplot(df['bmi'], bins=30, kde=True, ax=axes[0, 1])
axes[0, 1].set_title('BMI Distribution')

sns.histplot(df['children'], bins=30, kde=True, ax=axes[1, 0])
axes[1, 0].set_title('Children Distribution')

sns.histplot(df['charges'], bins=30, kde=True, ax=axes[1, 1])
axes[1, 1].set_title('Charges Distribution')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [25]:
def detect_outlier(data,treshold=1.5):
    q1 = np.quantile(data,0.25)
    q3 = np.quantile(data,0.75)
    iqr = q3 - q1

    lower_bound = q1 - treshold * iqr
    upper_bound = q3 + treshold * iqr

    return lower_bound,upper_bound

low,up = detect_outlier(df["bmi"])
print(df[(df["bmi"] < low) | (df["bmi"] > up)])
index = df[(df["bmi"] < low) | (df["bmi"] > up)].index
df.drop(index=index,inplace=True)
df.reset_index(drop=True,inplace=True)
      age     sex    bmi  children smoker     region      charges bmi_group
116    58    male  49.06         0     no  southeast  11381.32540     obese
286    46  female  48.07         2     no  northeast   9432.92530     obese
401    47    male  47.52         1     no  southeast   8083.91980     obese
543    54  female  47.41         0    yes  southeast  63770.42801     obese
847    23    male  50.38         1     no  southeast   2438.05520     obese
860    37  female  47.60         2    yes  southwest  46113.51100     obese
1047   22    male  52.58         1    yes  southeast  44501.39820     obese
1088   52    male  47.74         1     no  southeast   9748.91060     obese
1317   18    male  53.13         0     no  southeast   1163.46270     obese
In [26]:
print(df.describe())

sns.histplot(df['bmi'], bins=30, kde=True)
               age          bmi     children       charges
count  1328.000000  1328.000000  1328.000000   1328.000000
mean     39.219127    30.537308     1.097139  13221.047810
std      14.042170     5.922176     1.208008  11997.547468
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.220000     0.000000   4744.325050
50%      39.000000    30.300000     1.000000   9369.615750
75%      51.000000    34.488750     2.000000  16604.302645
max      64.000000    46.750000     5.000000  62592.873090
Out[26]:
<Axes: xlabel='bmi', ylabel='Count'>
No description has been provided for this image
In [27]:
from scipy.stats import skew

skewness_charges = skew(df['charges']).round(2)

print("Charges Skewness:", skewness_charges)
Charges Skewness: 1.5
In [28]:
from scipy.stats import skew

skewness_bmi = skew(df['bmi']).round(2)
skewness_age = skew(df['age']).round(2)
skewness_child = skew(df['children']).round(2)

print("Charges Skewness:", skewness_charges)
print("BMI Skewness:", skewness_bmi)
print("Age Skewness:", skewness_age)
print("Children Skewness:", skewness_child)
Charges Skewness: 1.5
BMI Skewness: 0.16
Age Skewness: 0.06
Children Skewness: 0.93
In [29]:
from scipy.stats import kurtosis

kurtosis_charges = kurtosis(df['charges']).round(2)
kurtosis_bmi = kurtosis(df['bmi']).round(2)
kurtosis_age = kurtosis(df['age']).round(2)
kurtosis_child = kurtosis(df['children']).round(2)

print("Charges Kurtosis:", kurtosis_charges)
print("BMI Kurtosis:", kurtosis_bmi)
print("Age Kurtosis:", kurtosis_age)
print("Children Kurtosis:", kurtosis_age)
Charges Kurtosis: 1.52
BMI Kurtosis: -0.37
Age Kurtosis: -1.24
Children Kurtosis: -1.24
In [30]:
df_males = df[df['sex'] == 'male']
df_females = df[df['sex'] == 'female']

fig, axes = plt.subplots(figsize=(6, 4))

female_plot = sns.histplot(df_females['bmi'], bins=30, kde=True,label = 'female', color='orange')
female_plot.set_title('Female BMI Distribution')

axes.set_title('BMI Distribution')
axes.legend()

plt.show()
No description has been provided for this image
In [31]:
df_females = df[df['sex'] == 'female']

df_females_0 = df_females[df_females['children'] == 0]
df_females_1 = df_females[df_females['children'] == 1]
df_females_2 = df_females[df_females['children'] == 2]

fig, axes = plt.subplots(figsize=(6, 4))

sns.histplot(df_females_0['bmi'], bins=30, kde=True, label = 'women with no children')
sns.histplot(df_females_1['bmi'], bins=30, kde=True,label = 'women with one child')
sns.histplot(df_females_2['bmi'], bins=30, kde=True,label = 'women with two children')

axes.set_title('BMI Distribution')
axes.legend()

plt.show()
No description has been provided for this image
In [32]:
df_females_0to2 = pd.DataFrame()

df_females_0to2= pd.concat([df_females_0to2, df_females_0])
df_females_0to2= pd.concat([df_females_0to2, df_females_1])
df_females_0to2= pd.concat([df_females_0to2, df_females_2])

sns.lmplot(df_females_0to2, x = 'bmi', y = 'charges', hue = 'children')
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x26fdd3167b0>
No description has been provided for this image
In [33]:
df_females_0to2_smoke = df_females_0to2[df_females_0to2['smoker'] == 'yes']
sns.lmplot(df_females_0to2_smoke, x = 'bmi', y = 'charges', hue = 'children')
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x26fdc82e420>
No description has been provided for this image
In [34]:
sns.lmplot(x="bmi", y="charges", hue="smoker", data=df)
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x26fdc56e420>
No description has been provided for this image
In [35]:
df_females_0to2_smoke1 = df_females_0to2[df_females_0to2['children'] == 0]
df_females_0to2_smoke1 = df_females_0to2[df_females_0to2['children'] == 1]

sns.lmplot(df_females_0to2_smoke1, x = 'bmi', y = 'charges', hue = 'smoker')
Out[35]:
<seaborn.axisgrid.FacetGrid at 0x26fdfa73530>
No description has been provided for this image
In [36]:
def examine_trends(df):
  fig, axes = plt.subplots(nrows=3, ncols=3,figsize=(9, 9))

  sns.regplot(df, x = 'bmi', y = 'charges', ax = axes[0,0])
  axes[0, 0].set_title('bmi vs. charges')
  sns.regplot(df, x = 'bmi', y = 'age', ax = axes[0,1])
  axes[0, 1].set_title('bmi vs. age')
  sns.histplot(df['smoker'], ax = axes[0,2])

  df_smoke = df[df['smoker'] == 'yes']
  print(df_smoke.head())

  sns.regplot(df_smoke, x = 'bmi', y = 'charges', ax = axes[1,0])
  axes[1,0].set_title('smokers: bmi vs charges')

  sns.histplot(df_smoke['bmi_group'], ax = axes[1,1])
  axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
  axes[1,1].set_title('smokers: bmi_group')

  sns.histplot(df_smoke['region'], ax = axes[1,2])
  axes[1,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
  axes[1,2].set_title('smokers: bmi_group')

  df_nonsmoke = df[df['smoker'] == 'no']
  print(df_nonsmoke.head())

  sns.regplot(df_nonsmoke, x = 'bmi', y = 'charges', ax = axes[2,0])
  axes[2,0].set_title('non-smokers: bmi vs charges')

  sns.histplot(df_nonsmoke['bmi_group'], ax = axes[2,1])
  axes[2,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
  axes[2,1].set_title('non-smokers: region')

  sns.histplot(df_nonsmoke['region'], ax = axes[2,2])
  axes[2,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
  axes[2,2].set_title('non-smokers: region')


  fig.subplots_adjust(wspace=0.4, hspace=1.0)

examine_trends(df_females_0to2)
    age     sex    bmi  children smoker     region     charges   bmi_group
0    19  female  27.90         0    yes  southwest  16884.9240  overweight
11   62  female  26.29         0    yes  southeast  27808.7251  overweight
64   20  female  22.42         0    yes  northwest  14711.7438      normal
70   27  female  24.75         0    yes  southeast  16577.7795      normal
86   57  female  31.16         0    yes  northwest  43578.9394       obese
    age     sex     bmi  children smoker     region      charges   bmi_group
5    31  female  25.740         0     no  southeast   3756.62160  overweight
9    60  female  25.840         0     no  northwest  28923.13692  overweight
13   56  female  39.820         0     no  southeast  11090.71780       obese
20   60  female  36.005         0     no  northeast  13228.84695       obese
26   63  female  23.085         0     no  northeast  14451.83515      normal
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:17: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:21: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:31: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:35: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
No description has been provided for this image
In [37]:
examine_trends(df_females_0)
    age     sex    bmi  children smoker     region     charges   bmi_group
0    19  female  27.90         0    yes  southwest  16884.9240  overweight
11   62  female  26.29         0    yes  southeast  27808.7251  overweight
64   20  female  22.42         0    yes  northwest  14711.7438      normal
70   27  female  24.75         0    yes  southeast  16577.7795      normal
86   57  female  31.16         0    yes  northwest  43578.9394       obese
    age     sex     bmi  children smoker     region      charges   bmi_group
5    31  female  25.740         0     no  southeast   3756.62160  overweight
9    60  female  25.840         0     no  northwest  28923.13692  overweight
13   56  female  39.820         0     no  southeast  11090.71780       obese
20   60  female  36.005         0     no  northeast  13228.84695       obese
26   63  female  23.085         0     no  northeast  14451.83515      normal
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:17: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:21: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:31: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:35: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
No description has been provided for this image
In [38]:
examine_trends(df_females_1)
     age     sex    bmi  children smoker     region     charges   bmi_group
23    34  female  31.92         1    yes  northeast  37701.8768       obese
58    53  female  22.88         1    yes  southeast  23244.7902      normal
116   29  female  27.94         1    yes  southeast  19107.7796  overweight
260   20  female  26.84         1    yes  southeast  17085.2676  overweight
279   40  female  28.12         1    yes  northeast  22331.5668  overweight
    age     sex     bmi  children smoker     region      charges   bmi_group
6    46  female  33.440         1     no  southeast   8240.58960       obese
16   52  female  30.780         1     no  northeast  10797.33620       obese
21   30  female  32.400         1     no  southwest   4149.73600       obese
63   28  female  25.935         1     no  northwest   4133.64165  overweight
76   29  female  29.590         1     no  southeast   3947.41310  overweight
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:17: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:21: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:31: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:35: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
No description has been provided for this image
In [39]:
examine_trends(df_females_2)
     age     sex     bmi  children smoker     region     charges    bmi_group
84    37  female  34.800         2    yes  southwest  39836.5190        obese
94    64  female  31.300         2    yes  southwest  47291.0550        obese
127   32  female  17.765         2    yes  northwest  32734.1863  underweight
234   40  female  22.220         2    yes  southeast  19444.2658       normal
239   23  female  36.670         2    yes  northeast  38511.6283        obese
    age     sex     bmi  children smoker     region      charges bmi_group
27   55  female  32.775         2     no  northwest  12268.63225     obese
41   31  female  36.630         2     no  southeast   4949.75870     obese
43   37  female  30.800         2     no  southeast   6313.75900     obese
46   18  female  38.665         2     no  northeast   3393.35635     obese
51   21  female  33.630         2     no  northwest   3579.82870     obese
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:17: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:21: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[1,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:31: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,1].set_xticklabels(axes[1,1].get_xticklabels(), rotation=40)
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\698283473.py:35: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  axes[2,2].set_xticklabels(axes[1,2].get_xticklabels(), rotation=40)
No description has been provided for this image
In [40]:
df_females_3 = df_females[df_females['children'] == 3]
df_females_4 = df_females[df_females['children'] == 4]
df_females_5 = df_females[df_females['children'] == 5]

fig, axes = plt.subplots(nrows=3, ncols=2,figsize=(10, 8))

sns.histplot(df_females_0['bmi'], bins=30, kde=True, label = 'women with no children',ax=axes[0, 0])
axes[0, 0].set_title('women with no children')
sns.histplot(df_females_1['bmi'], bins=30, kde=True,label = 'women with one child',ax=axes[0, 1])
axes[0, 1].set_title('women with one children')
sns.histplot(df_females_2['bmi'], bins=30, kde=True,label = 'women with two children',ax=axes[1, 0])
axes[1, 0].set_title('women with two children')
sns.histplot(df_females_3['bmi'], bins=30, kde=True,label = 'women with three children',ax=axes[1, 1])
axes[1, 1].set_title('women with three children')
sns.histplot(df_females_4['bmi'], bins=30, kde=True,label = 'women with four children',ax=axes[2, 0])
axes[2, 0].set_title('women with four children')
sns.histplot(df_females_5['bmi'], bins=30, kde=True,label = 'women with five children',ax=axes[2, 1])
axes[2, 1].set_title('women with five children')

fig.subplots_adjust(wspace=0.5, hspace=1)
plt.show()
No description has been provided for this image
In [41]:
df_smokers_m = df_males[df_males['smoker'] == 'yes']

df_smokers_f = df_females[df_females['smoker'] == 'yes']

fig, axes = plt.subplots(figsize=(4, 4))
sns.histplot(df_smokers_f['charges'], bins=30, kde=True,label = 'female smokers')
sns.histplot(df_smokers_m['charges'], bins=30, kde=True, label = 'male smokers')

axes.set_title('Smokers distribution')
axes.legend()
Out[41]:
<matplotlib.legend.Legend at 0x26fe12b5070>
No description has been provided for this image
In [42]:
df_smokers = df[df['smoker'] == 'yes']

df_smokers_m = df_smokers[df_smokers['sex']== 'male']
df_smokers_f = df_smokers[df_smokers['sex']== 'female']

df_smokers_m_obese = df_smokers_m[df_smokers['bmi_group']== 'obese']
df_smokers_f_obese = df_smokers_f[df_smokers['bmi_group']== 'obese']


fig, axes = plt.subplots(figsize=(4, 4))
sns.histplot(df_smokers_m_obese['charges'], bins=30, kde=True,label = 'male smokers')
sns.histplot(df_smokers_f_obese['charges'], bins=30, kde=True, label = 'female smokers')

axes.set_title('Obese smokers charges distribution by sex')
axes.legend()
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\252698876.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  df_smokers_m_obese = df_smokers_m[df_smokers['bmi_group']== 'obese']
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\252698876.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  df_smokers_f_obese = df_smokers_f[df_smokers['bmi_group']== 'obese']
Out[42]:
<matplotlib.legend.Legend at 0x26fdd00bb00>
No description has been provided for this image
In [43]:
print(df_smokers_m.head())

sns.lmplot(df_smokers_m, x ='age', y = 'charges', hue = 'bmi_group')

sns.lmplot(df_smokers_m, x ='age', y = 'charges', hue = 'children')

sns.lmplot(df_smokers_m, x ='age', y = 'charges', hue = 'region')
sns.lmplot(df_smokers_m, x ='bmi', y = 'charges', hue = 'region')
    age   sex    bmi  children smoker     region      charges bmi_group
14   27  male  42.13         0    yes  southeast  39611.75770     obese
19   30  male  35.30         0    yes  southwest  36837.46700     obese
29   31  male  36.30         2    yes  southwest  38711.00000     obese
30   22  male  35.60         0    yes  southwest  35585.57600     obese
34   28  male  36.40         1    yes  southwest  51194.55914     obese
Out[43]:
<seaborn.axisgrid.FacetGrid at 0x26fe10b87d0>
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [44]:
df.head()
Out[44]:
age sex bmi children smoker region charges bmi_group
0 19 female 27.900 0 yes southwest 16884.92400 overweight
1 18 male 33.770 1 no southeast 1725.55230 obese
2 28 male 33.000 3 no southeast 4449.46200 obese
3 33 male 22.705 0 no northwest 21984.47061 normal
4 32 male 28.880 0 no northwest 3866.85520 overweight
In [45]:
df_se = df[df['region'] == 'southeast']
df_sw = df[df['region'] == 'southwest']
df_ne = df[df['region'] == 'northeast']
df_nw = df[df['region'] == 'northwest']

def lmplot_regions(df):
  sns.lmplot(df, x = 'bmi', y = 'charges', hue = 'sex')
  sns.lmplot(df, x = 'bmi', y = 'charges', hue = 'smoker')

  sns.lmplot(df, x = 'age', y = 'charges', hue = 'sex')
  sns.lmplot(df, x = 'age', y = 'charges', hue = 'smoker')
  sns.lmplot(df, x = 'age', y = 'charges', hue = 'bmi_group')
In [46]:
lmplot_regions(df_se)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [47]:
lmplot_regions(df_sw)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [48]:
lmplot_regions(df_ne)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [49]:
lmplot_regions(df_nw)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [50]:
sns.pairplot(df,kind = 'reg', height = 2, hue="smoker")
Out[50]:
<seaborn.axisgrid.PairGrid at 0x26fe3cfd6d0>
No description has been provided for this image
In [51]:
sns.pairplot(df_males,kind = 'reg', height = 2, hue="smoker")
Out[51]:
<seaborn.axisgrid.PairGrid at 0x26fe3fec8f0>
No description has been provided for this image
In [52]:
sns.pairplot(df_females,kind = 'reg', height = 2, hue="smoker")
Out[52]:
<seaborn.axisgrid.PairGrid at 0x26fe5b98500>
No description has been provided for this image
In [53]:
with sns.axes_style("whitegrid"):
    sns.scatterplot(x=df['age'], y=df['charges'])
    plt.xlabel('Age')
    plt.ylabel('Charges (USD)')
    plt.title('Scatter Plot of Charges by Age')

plt.show()
No description has been provided for this image
In [54]:
sns.scatterplot(x=df['age'], y=df['charges'], hue=df['children'], hue_norm=(0, 5), palette = 'viridis', legend="full")
plt.xlabel('Age')
plt.ylabel('Charges')
plt.title('Scatter Plot of Age and children vs. Charges')
plt.legend(title='Children', loc='upper right')

plt.show()
No description has been provided for this image
In [55]:
fig, ax = plt.subplots(figsize=(12, 8))
for i, col in enumerate([col for col in df.columns if df[col].nunique() <= 6], 1):
    plt.subplot(2, 3, i)

    ax = sns.boxplot(data=df, x=df[col].astype('category'), y='charges', showmeans = True, meanline = True, medianprops = dict(color = "red", linewidth = 1.5))
    plt.title(f'{col} vs charges')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [56]:
sns.catplot(data=df,
            x="bmi_group", y="charges", hue="smoker", kind="box", palette="viridis")

sns.despine(left=True)
plt.xlabel('BMI Group')
plt.ylabel('Charges')
plt.title('Charges by BMI and Smoking')
plt.show()
No description has been provided for this image
In [57]:
sns.catplot(data=df,
            x="region", y="charges", hue="smoker", kind="box", palette="viridis")

sns.despine(left=True)
plt.xlabel('region')
plt.ylabel('Charges')
plt.title('Charges by Region and Smoking')
plt.show()
No description has been provided for this image
In [58]:
fig , ax = plt.subplots()
df["children"].value_counts().plot(kind="bar",ax=ax,colormap="plasma")
ax.grid(axis="y",ls="--",color="gray")
ax.set(ylabel="counts",title="Number of Children");
No description has been provided for this image

Part 2 - Correlation¶

In [60]:
df_cleaned = df

df_cleaned['age_group'] = pd.cut(df_cleaned['age'], bins=[0, 25, 40, 60, df_cleaned['age'].max()], labels=['Young', 'Adult', 'Middle-aged', 'Senior'])
df_cleaned.sample(5)
Out[60]:
age sex bmi children smoker region charges bmi_group age_group
48 60 female 24.530 0 no southeast 12629.89670 normal Middle-aged
1055 36 male 28.595 3 no northwest 6548.19505 overweight Adult
1264 35 male 27.610 1 no southeast 4747.05290 overweight Adult
269 18 male 29.370 1 no southeast 1719.43630 overweight Young
37 26 male 20.800 0 no southwest 2302.30000 normal Adult
In [61]:
df_encoded = pd.get_dummies(df_cleaned, columns=['region'], prefix='region', dtype=int)
df_encoded.sample(5)
Out[61]:
age sex bmi children smoker charges bmi_group age_group region_northeast region_northwest region_southeast region_southwest
934 50 female 46.090 1 no 9549.56510 obese Middle-aged 0 0 1 0
1011 54 female 35.815 3 no 12495.29085 obese Middle-aged 0 1 0 0
54 40 female 28.690 3 no 8059.67910 overweight Adult 0 1 0 0
719 50 female 27.075 1 no 10106.13425 overweight Middle-aged 1 0 0 0
32 19 female 28.600 5 no 4687.79700 overweight Young 0 0 0 1
In [62]:
label_encoder = LabelEncoder()
df_encoded['smoker_encoded'] = label_encoder.fit_transform(df_encoded['smoker'])
df_encoded.sample(5)
Out[62]:
age sex bmi children smoker charges bmi_group age_group region_northeast region_northwest region_southeast region_southwest smoker_encoded
793 58 female 33.10 0 no 11848.14100 obese Middle-aged 0 0 0 1 0
1105 23 male 24.51 0 no 2396.09590 normal Young 1 0 0 0 0
333 64 male 34.50 0 no 13822.80300 obese Senior 0 0 0 1 0
139 34 male 22.42 2 no 27375.90478 normal Adult 1 0 0 0 0
1224 58 male 23.30 0 no 11345.51900 normal Middle-aged 0 0 0 1 0
In [63]:
df_encoded['sex_encoded'] = label_encoder.fit_transform(df_encoded['sex'])
df_encoded.sample(5)
Out[63]:
age sex bmi children smoker charges bmi_group age_group region_northeast region_northwest region_southeast region_southwest smoker_encoded sex_encoded
230 59 female 27.83 3 no 14001.28670 overweight Middle-aged 0 0 1 0 0 0
473 24 male 28.50 0 yes 35147.52848 overweight Young 1 0 0 0 1 1
1038 43 female 24.70 2 yes 21880.82000 normal Middle-aged 0 1 0 0 1 0
76 29 female 29.59 1 no 3947.41310 overweight Adult 0 0 1 0 0 0
1075 32 male 31.50 1 no 4076.49700 obese Adult 0 0 0 1 0 1
In [64]:
df_encoded = df_encoded[[x for x in df_encoded.columns if x not in ['smoker', 'sex']]]
df_encoded.sample(5)
Out[64]:
age bmi children charges bmi_group age_group region_northeast region_northwest region_southeast region_southwest smoker_encoded sex_encoded
269 18 29.370 1 1719.43630 overweight Young 0 0 1 0 0 1
262 19 36.955 0 36219.40545 obese Young 0 1 0 0 1 1
259 58 25.200 0 11837.16000 overweight Middle-aged 0 0 0 1 0 0
96 54 30.800 3 12105.32000 obese Middle-aged 0 0 0 1 0 0
206 35 27.740 2 20984.09360 overweight Adult 1 0 0 0 1 1
In [65]:
df_cat=df_encoded.drop(['bmi_group','age_group'], axis = 1)
df_cat.head()

print(df_cat.dtypes)

from sklearn import preprocessing
label = preprocessing.LabelEncoder()
age                   int64
bmi                 float64
children              int64
charges             float64
region_northeast      int32
region_northwest      int32
region_southeast      int32
region_southwest      int32
smoker_encoded        int32
sex_encoded           int32
dtype: object
In [66]:
plt.figure(figsize=(6,4))
sns.heatmap(df_cat.corr(),cmap='coolwarm',annot=True)
Out[66]:
<Axes: >
No description has been provided for this image
In [67]:
numr_cols = [x for x in df_encoded.columns if x not in ['age_group', 'bmi_group']]
corr_matrix_1 = df_encoded[numr_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix_1, annot=True, cmap = 'crest',
                mask=np.triu(corr_matrix_1),
                            annot_kws = {"size":8});

plt.title("Correlation Matrix")
plt.show()

threshold = 0.3
relevant_features = corr_matrix_1[(corr_matrix_1['charges'].abs() > threshold) & (corr_matrix_1.index != 'charges')].index.tolist()
print("Relevant features based on correlation:")
print(relevant_features)
No description has been provided for this image
Relevant features based on correlation:
['age', 'smoker_encoded']
In [68]:
df_cleaned_1 = df_cleaned.copy()
percetage_data(df_cleaned_1,['age_group','smoker'], sort=False)
Out[68]:
count percentage
age_group smoker
Young no 238 17.92
yes 64 4.82
Adult no 311 23.42
yes 83 6.25
Middle-aged no 439 33.06
yes 102 7.68
Senior no 69 5.20
yes 22 1.66

Part 3 - OLS Regression¶

In [70]:
XX = df_encoded[['age', 'sex_encoded', 'bmi', 'children', 'smoker_encoded', 'region_northeast',	'region_northwest',	'region_southeast',	'region_southwest']]# Features
yy = df_encoded['charges']

XX_train, XX_test, yy_train, yy_test = train_test_split(XX, yy, test_size=0.2, random_state=42)

linear_model = LinearRegression()
linear_model.fit(XX_train, yy_train)

yy_pred = linear_model.predict(XX_test)

mse = mean_squared_error(yy_test, yy_pred)
r2 = r2_score(yy_test, yy_pred)

(mse, r2)
Out[70]:
(35465549.24040005, 0.7448498340547971)
In [71]:
coefficients = pd.DataFrame(linear_model.coef_, XX.columns, columns=['Coefficient'])

coefficients.sort_values(by='Coefficient', ascending=False)
Out[71]:
Coefficient
smoker_encoded 23743.305766
region_northeast 856.698147
children 529.464610
bmi 349.241387
region_northwest 282.020100
age 259.030180
sex_encoded -38.688059
region_southeast -445.911307
region_southwest -692.806941
In [72]:
X = df_encoded[['age', 'sex_encoded', 'bmi', 'children', 'smoker_encoded']]
y = df_encoded['charges']

X = sm.add_constant(X)

est = sm.OLS(y, X).fit()

print(est.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                charges   R-squared:                       0.749
Model:                            OLS   Adj. R-squared:                  0.748
Method:                 Least Squares   F-statistic:                     787.2
Date:                Wed, 28 Aug 2024   Prob (F-statistic):               0.00
Time:                        14:14:38   Log-Likelihood:                -13440.
No. Observations:                1328   AIC:                         2.689e+04
Df Residuals:                    1322   BIC:                         2.692e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           -1.21e+04    964.609    -12.540      0.000    -1.4e+04   -1.02e+04
age              256.9833     11.878     21.636      0.000     233.682     280.285
sex_encoded      -35.1936    332.282     -0.106      0.916    -687.051     616.663
bmi              324.6392     28.152     11.532      0.000     269.411     379.867
children         476.3869    137.133      3.474      0.001     207.366     745.408
smoker_encoded  2.362e+04    411.853     57.346      0.000    2.28e+04    2.44e+04
==============================================================================
Omnibus:                      304.191   Durbin-Watson:                   2.088
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              727.859
Skew:                           1.235   Prob(JB):                    8.86e-159
Kurtosis:                       5.655   Cond. No.                         299.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [73]:
import statsmodels.api as sm
import numpy as np
from scipy.stats import skew, kurtosis

X = df_encoded[['age', 'sex_encoded', 'bmi', 'children', 'smoker_encoded']]
y = df_encoded['charges']

X = sm.add_constant(X)

est = sm.OLS(y, X).fit()

predicted_values = est.predict(X)

# Residuals
residuals = y - predicted_values

# Skewness and kurtosis of residuals
skewness = skew(residuals)
kurt = kurtosis(residuals)

print("Skewness of residuals:", skewness)
print("Kurtosis of residuals:", kurt)
Skewness of residuals: 1.235188517649841
Kurtosis of residuals: 2.6554309099204882
In [74]:
print('Parameters: ', est.params)
print()
print('R2: ', round(est.rsquared, 3))
print()
print('Standard errors: ', est.bse)
Parameters:  const            -12095.798145
age                 256.983299
sex_encoded         -35.193643
bmi                 324.639234
children            476.386936
smoker_encoded    23618.034063
dtype: float64

R2:  0.749

Standard errors:  const             964.609361
age                11.877707
sex_encoded       332.281726
bmi                28.152221
children          137.132593
smoker_encoded    411.852918
dtype: float64
In [75]:
prediction = est.predict()
In [76]:
plt.figure(figsize=(12, 5))

plt.plot(df_encoded['charges'], color='g', label='Original')
plt.plot(prediction, color='b', label='Prediction')

plt.title('Original vs Predicted Values')
plt.legend()
plt.show()
No description has been provided for this image
In [77]:
residuals = pd.Series(est.resid)
residuals[:5]
Out[77]:
0    -8577.429244
1    -2208.609180
2    -2757.334135
3    18264.079704
4    -1601.199679
dtype: float64
In [78]:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

residuals.plot(ax=ax[0])
residuals.plot(kind='density', ax=ax[1])

ax[0].set_title('Residual Plot')
ax[1].set_title('Density Plot')

plt.tight_layout()
plt.show()
No description has been provided for this image

Part 4 - Linear Regression and Decision Trees¶

In [80]:
X = df_encoded[['age', 'bmi', 'smoker_encoded']]
y = df_encoded['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [81]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
In [82]:
lm.fit(X_train,y_train)
Out[82]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [83]:
print('Intercept', lm.intercept_)
print('Coefficient', lm.coef_)
print('Score', lm.score(X_test, y_test))
Intercept -12009.42599594116
Coefficient [  262.06710911   328.98624301 23765.27695741]
Score 0.7488004023499725
In [84]:
print("Summary statistics of y_test:")
print(y_test.describe())
Summary statistics of y_test:
count      266.000000
mean     13125.146488
std      11811.996205
min       1242.260000
25%       4745.628900
50%       9172.281725
75%      15992.868725
max      49577.662400
Name: charges, dtype: float64
In [85]:
negative_values = y_test[y_test < 0]
print("Negative values in y_test:")
print(negative_values)
Negative values in y_test:
Series([], Name: charges, dtype: float64)
In [86]:
y_pred1 = lm.predict(X_test)
lm_mse = mean_squared_error(y_test, y_pred1)
print("Linear Regression - MSE: ", lm_mse)
lm_rmse = np.sqrt(lm_mse)
print("Linear Regression - RMSE: ", lm_rmse)
Linear Regression - MSE:  34916425.261267714
Linear Regression - RMSE:  5909.012206897843
In [87]:
ax1 = sns.histplot(y_test, kde=True, color="g", label="Actual Value", element='step')
sns.histplot(y_pred1, kde=True, color="b", label="Fitted Values", ax=ax1, element='step')

mean_y_test = y_test.mean()
plt.axvline(mean_y_test, color='g', linestyle='--', label='Mean of Actual Value')

mean_y_pred1 = y_pred1.mean()
plt.axvline(mean_y_pred1, color='b', linestyle='--', label='Mean of Fitted Values')

plt.title('Actual vs Fitted Values for Charges')
plt.xlabel('Charges')
plt.ylabel('Density')
plt.legend()
plt.show()
plt.close()
No description has been provided for this image
In [88]:
X = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y = np.array(df_encoded['charges'])

# Defining number of folds for cross-validation
n_splits = 3
kf = KFold(n_splits=n_splits)

# Initializing lists to store MSE scores
mse_scores = []

# Performing k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fitting linear regression model
    model1 = LinearRegression()
    model1.fit(X_train, y_train)
    
    # Predicting on test set
    y_pred = model1.predict(X_test)
    
    # Calculating MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

# Calculating average MSE across all folds
avg_mse = np.mean(mse_scores)
print("Average MSE:", avg_mse)

lr_rmse = np.sqrt(avg_mse)
print("Linear Regression - RMSE: ", lr_rmse)
Average MSE: 36713110.02551001
Linear Regression - RMSE:  6059.134428737327
In [89]:
X = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y = np.array(df_encoded['charges'])

# Defining the range of training set sizes for the learning curve
train_sizes = np.linspace(0.1, 1.0, 10) 

# Defining the linear regression model
model = LinearRegression()

# Learning curves
train_sizes_abs, train_scores, test_scores = learning_curve(
    estimator=model, X=X, y=y, train_sizes=train_sizes, cv=5, scoring='neg_mean_squared_error')

# Calculatinh mean and standard deviation of training and test scores
train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plotting learning curves
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="b")
plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="r")
plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="r", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Negative Mean Squared Error")
plt.title("Learning Curves (Linear Regression)")
plt.legend(loc="best")
plt.grid(True)
plt.show()
No description has been provided for this image
In [90]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_mae = mean_absolute_error(y_test, dt_predictions)

plt.figure(figsize=(6, 4))
plt.scatter(y_test, dt_predictions, color='green', cmap = 'crest', label='Decision Tree')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.title('Decision Tree: Actual vs. Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend()
plt.show()

rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_mae = mean_absolute_error(y_test, rf_predictions)

plt.figure(figsize=(8, 4))
plt.scatter(y_test, rf_predictions, color='orange', label='Random Forest')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.title('Random Forest: Actual vs. Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend()
plt.show()

gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)
gb_mse = mean_squared_error(y_test, gb_predictions)
gb_mae = mean_absolute_error(y_test, gb_predictions)

plt.figure(figsize=(8, 4))
plt.scatter(y_test, gb_predictions, color='purple', label='Gradient Boosting')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linestyle='--')
plt.title('Gradient Boosting: Actual vs. Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend()
plt.show()
C:\Users\chuch\AppData\Local\Temp\ipykernel_15788\2615700504.py:8: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  plt.scatter(y_test, dt_predictions, color='green', cmap = 'crest', label='Decision Tree')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [91]:
print("Decision Tree - MSE: ", dt_mse)
print("Decision Tree - MAE: ", dt_mae)

print("Random Forest - MSE: ", rf_mse)
print("Random Forest - MAE: ", rf_mae)

print("Gradient Boosting - MSE: ", gb_mse)
print("Gradient Boosting - MAE: ", gb_mae)

dt_rmse = np.sqrt(dt_mse)
print("Decision Tree Regression - RMSE: ", dt_rmse)
Decision Tree - MSE:  42607619.81369367
Decision Tree - MAE:  3441.604622364253
Random Forest - MSE:  28891677.399929322
Random Forest - MAE:  2880.297838077721
Gradient Boosting - MSE:  24215052.39802169
Gradient Boosting - MAE:  2631.8425987205937
Decision Tree Regression - RMSE:  6527.451249430644
In [92]:
# Residuals
residuals = y_test - dt_predictions

# Computing skewness and kurtosis of residuals
skewness = skew(residuals)
kurt = kurtosis(residuals)

print("Skewness of residuals:", skewness)
print("Kurtosis of residuals:", kurt)
Skewness of residuals: 0.31645991798766737
Kurtosis of residuals: 4.006586691289017
In [93]:
new_data = pd.DataFrame({'age': [30], 'bmi': [32.10], 'smoker_encoded': [1]})

dt_predictions = dt_model.predict(new_data)
print("Decision Tree Predictions:", dt_predictions)

rf_predictions = rf_model.predict(new_data)
print("Random Forest Predictions:", rf_predictions)

gb_predictions = gb_model.predict(new_data)
print("Gradient Boosting Predictions:", gb_predictions)
Decision Tree Predictions: [37079.372]
Random Forest Predictions: [36774.7981338]
Gradient Boosting Predictions: [38282.43596733]
C:\Users\chuch\anaconda3\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeRegressor was fitted without feature names
  warnings.warn(
C:\Users\chuch\anaconda3\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(
C:\Users\chuch\anaconda3\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but GradientBoostingRegressor was fitted without feature names
  warnings.warn(
In [94]:
X2 = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y2 = np.array(df_encoded['charges'])

# Defining number of folds for cross-validation
n_splits = 50
kf = KFold(n_splits=n_splits)

# Initializing lists to store MSE scores
mse2_scores = []

# Performing k-fold cross-validation
for train_index, test_index in kf.split(X2):
    X2_train, X2_test = X2[train_index], X2[test_index]
    y2_train, y2_test = y2[train_index], y2[test_index]
    
    # Fitting decision tree model
    model = DecisionTreeRegressor()
    model.fit(X2_train, y2_train)
    
    # Predicting on test set
    y2_pred = model.predict(X2_test)
    
    # Calculating MSE
    mse2 = mean_squared_error(y2_test, y2_pred)
    mse2_scores.append(mse2)

# Calculating average MSE across all folds
avg2_mse = np.mean(mse2_scores)
print("Average MSE:", avg2_mse)

dt_rmse = np.sqrt(avg2_mse)
print("Decision Tree - RMSE: ", dt_rmse)
Average MSE: 43138950.44409099
Decision Tree - RMSE:  6568.024851056137
In [95]:
X5 = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y5 = np.array(df_encoded['charges'])

# Defining the range of training set sizes for the learning curve
train_sizes = np.linspace(0.1, 1.0, 10)  # 10 equally spaced training set sizes from 10% to 100%

# Defining the decision tree regressor model
model = DecisionTreeRegressor()

# Learning curves
train_sizes_abs, train_scores, test_scores = learning_curve(
    estimator=model, X=X5, y=y5, train_sizes=train_sizes, cv=5, scoring='neg_mean_squared_error')

# Plotting learning curves
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="b")
plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="r")
plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="r", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Negative Mean Squared Error")
plt.title("Learning Curves (Decision Tree Regressor)")
plt.legend(loc="best")
plt.grid(True)
plt.show()
No description has been provided for this image
In [96]:
X = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y = np.array(df_encoded['charges'])

train_sizes = np.linspace(0.1, 1.0, 10) 

# Defining the random forest regressor model
model = RandomForestRegressor()

# Learning curves
train_sizes_abs, train_scores, test_scores = learning_curve(
    estimator=model, X=X, y=y, train_sizes=train_sizes, cv=5, scoring='neg_mean_squared_error')

# Plotting learning curves
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="b")
plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="r")
plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="r", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Negative Mean Squared Error")
plt.title("Learning Curves (Random Forest)")
plt.legend(loc="best")
plt.grid(True)
plt.show()
No description has been provided for this image
In [97]:
X = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y = np.array(df_encoded['charges'])

train_sizes = np.linspace(0.1, 1.0, 10) 

# Defining the gradient boosting regressor model
model = GradientBoostingRegressor()

# Generating learning curves
train_sizes_abs, train_scores, test_scores = learning_curve(
    estimator=model, X=X, y=y, train_sizes=train_sizes, cv=5, scoring='neg_mean_squared_error')

# Plotting learning curves
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="b")
plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="r")
plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="r", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Negative Mean Squared Error")
plt.title("Learning Curves (Gradient Boost)")
plt.legend(loc="best")
plt.grid(True)
plt.show()
No description has been provided for this image
In [98]:
X6 = np.array(df_encoded[['age', 'bmi', 'smoker_encoded']])
y6 = np.array(df_encoded['charges'])

train_sizes = np.linspace(0.1, 1.0, 10) 

# Defining the degree of polynomial features
degree = 3

# Creating a pipeline for polynomial regression
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Generating learning curves
train_sizes_abs, train_scores, test_scores = learning_curve(
    estimator=model, X=X6, y=y6, train_sizes=train_sizes, cv=5, scoring='neg_mean_squared_error')

# Calculating mean and standard deviation of training and test scores
train_scores_mean = -np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plotting learning curves
plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes_abs, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="b")
plt.fill_between(train_sizes_abs, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="r")
plt.plot(train_sizes_abs, train_scores_mean, 'o-', color="b", label="Training score")
plt.plot(train_sizes_abs, test_scores_mean, 'o-', color="r", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("Negative Mean Squared Error")
plt.title("Learning Curves (Polynomial Regression)")
plt.legend(loc="best")
plt.grid(True)
plt.show()
No description has been provided for this image